Statistiques descriptives unidimensionnelles

1) Lancers d’un dé :

In [187]:
import numpy as np
import matplotlib.pyplot as plt
#plt.figure (figsize = (15 ,18))

%matplotlib inline 
# Alternative kept for reference: size a single figure instead of changing the default.
#plt.figure (figsize = (15 ,18))
# Default size (width, height) applied to every figure created after this line.
plt.rcParams['figure.figsize']=(20,10)
# Disabled first draft of the dice experiment (histogram + pie chart per sample size).
# It is superseded by the next code cell and is kept only as a reference.
#for n in [10, 20, 50, 100, 500, 2000]:
    
    # s = np.random.randint(1,7,n)
    #print(s)
    #hist, bins, patches = plt.hist(s, bins=[1, 2, 3, 4,5,6,7])
    #plt.show()
    #labels = '1', '2', '3', '4', '5','6'
    #explode = (0.1, 0, 0, 0, 0, 0)
    #plt.pie(hist, explode=explode, labels=labels, autopct='%1.1f%%',shadow=True,startangle=90)
    #plt.axis('equal')
    #plt.show()
    # plt.show()
# Determines the proportions of the values with the hist function.

Pour chaque expérience on calcule les indicateurs suivants sur la répartition des classes :

  • le pourcentage minimum
  • le pourcentage maximum
  • la différence entre le pourcentage maximum et le pourcentage minimum
  • l’écart type des pourcentages
In [183]:
# Dice experiment: for each sample size n, roll a die n times, compute the share
# of each face and record four indicators of how uneven the shares are.
Xmin = []  # smallest class percentage, one entry per sample size
Xmax = []  # largest class percentage
Xsd = []   # standard deviation of the class percentages
Dff = []   # largest minus smallest class percentage (was declared but never filled)

# Plot settings that do not depend on n are built once, outside the loop.
bin_edges = [1, 2, 3, 4, 5, 6, 7]  # unit-width bins, one per face
labels = '1', '2', '3', '4', '5', '6'
explode = (0.1, 0, 0, 0, 0, 0)  # pull the slice of face 1 out of the pie

for n in [10, 20, 50, 100, 500, 2000]:
    s = np.random.randint(1, 7, n)  # upper bound is exclusive: faces 1..6
    print(n)

    # `normed=` was removed from np.histogram; `density=True` is the replacement.
    # With unit-width bins the density of a bin equals the proportion of that face.
    hist = np.histogram(s, bins=bin_edges, density=True)
    # The indicators are announced as percentages, so scale the proportions by 100.
    percentages = 100.0 * hist[0]

    pct_min = percentages.min()
    pct_max = percentages.max()
    pct_std = np.std(percentages)
    Xmin.append(pct_min)
    Xmax.append(pct_max)
    Dff.append(pct_max - pct_min)
    Xsd.append(pct_std)

    plt.pie(percentages, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
    plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    plt.xlabel('% minimum: {0}; % maximum : {1}; Difference: {2}; Ecartype: {3}'.format(
        pct_min, pct_max, pct_max - pct_min, pct_std))

    plt.show()
10
20
50
100
500
2000

2) histogramme normalisé et fonction de densité

1.1) Simulation de quelques jeux de données

N = 10 données de loi N(0,1)

In [180]:
# Draw 10 observations from a normal distribution with mean `mu` and
# standard deviation `sigma` (here the standard normal N(0, 1)), then show them.
mu = 0
sigma = 1  # standard deviation
X_a = np.random.normal(loc=mu, scale=sigma, size=10)
print(X_a)
[ 0.49966827  0.3143814   0.60695775  0.42267652  2.27452505 -0.44122443
 -2.56214731  0.7502957   0.42922084  1.56907179]

N = 10 données de loi U([0,1])

In [181]:
# Draw 10 observations from the uniform distribution on [0, 1) and show them.
X_b = np.random.uniform(low=0, high=1, size=10)
print(X_b)
[ 0.49527127  0.92070795  0.19032844  0.23076295  0.3513816   0.19484425
  0.57544133  0.14214983  0.39503965  0.63777319]

1.2) Simulation d'un jeu de 500 données selon une loi N(3,5) puis selon une loi U(3,5)

N= 500 de loi N(3,5)

In [182]:
# 500 draws from a normal distribution with mean 3.
# np.random.normal takes the standard deviation as its `scale`, so 5 is a std here.
X_1 = np.random.normal(loc=3, scale=5, size=500)

un histogramme des données brutes avec la fonction hist de matplotlib.pyplot

In [153]:
# Histogram of the raw counts of X_1 using 30 equal-width bins.
# plt.hist returns the bin counts, the bin edges and the drawn patches.
count, bins, patches = plt.hist(X_1, bins=30)

un histogramme normalisé (à l’aide la même fonction)

In [154]:
# Normalised histogram of X_1 (bar areas sum to 1).
# `normed=` was removed from plt.hist; `density=True` is its replacement.
count, bins, patches = plt.hist(X_1, 30, density=True)

N= 500 de loi U(3,5)

In [155]:
# 500 draws from the uniform distribution on [3, 5).
# The bounds were passed as (5, 3); NumPy documents high < low as undefined
# behaviour, so they are given in (low, high) order.
X_2 = np.random.uniform(low=3, high=5, size=500)

un histogramme des données brutes avec la fonction hist de matplotlib.pyplot

In [156]:
# Histogram of the raw counts of X_2 using 30 equal-width bins.
count, bins, patches = plt.hist(X_2, bins=30)

un histogramme normalisé (à l’aide la même fonction)

In [157]:
# Normalised histogram of X_2 (bar areas sum to 1).
# `normed=` was removed from plt.hist; `density=True` is its replacement.
count, bins, patches = plt.hist(X_2, 30, density=True)
In [158]:
from scipy.stats import norm

# Normalised histogram of X_1 with two normal densities drawn on top of it.
# `normed=` was removed from plt.hist; `density=True` is its replacement.
count, bins, patches = plt.hist(X_1, 30, density=True)
Xpdf = np.linspace(-15, 15, 100)  # evaluation grid for both curves

# Green: normal density using the sample mean and sample standard deviation.
plt.plot(Xpdf, norm.pdf(Xpdf, np.mean(X_1), np.std(X_1)), linewidth=2, color='g',
         label='norm pdf (sample mean and std)')
# Red: density of the distribution the data were drawn from. X_1 comes from
# np.random.normal(3, 5, 500), whose second argument is a standard deviation,
# so the matching scale is 5 (sqrt(5) would only be right if 5 were a variance).
plt.plot(Xpdf, norm.pdf(Xpdf, loc=3, scale=5), linewidth=2, color='r',
         label='norm pdf (loc=3, scale=5)')
# The curves carry labels, so draw the legend that displays them.
plt.legend(loc='best')
plt.show()
Out[158]:
[<matplotlib.lines.Line2D at 0x7fe3bd69bb10>]
In [159]:
from scipy.stats import uniform

# Normalised histogram of X_2 with the density of U(3, 5) drawn on top of it.
# `normed=` was removed from plt.hist; `density=True` is its replacement.
count, bins, patches = plt.hist(X_2, 30, density=True)
Xpdfu = np.linspace(3, 5, 100)  # evaluation grid covering the support [3, 5]

# scipy's uniform is defined on [loc, loc + scale], i.e. [3, 5] here.
plt.plot(Xpdfu, uniform.pdf(Xpdfu, loc=3, scale=2), linewidth=2, color='r', label='uniform pdf')
plt.legend(loc='best')  # the curve is labelled, so show the label
plt.show()

2) Illustration du théorème de la limite centrale (TCL)

p=10, n=5

In [160]:
# CLT illustration: p = 10 samples of size n = 5 drawn from U(3, 5).
# Each sample is plotted against the U(3, 5) density and its mean is stored.
X5_bar = []  # sample means, one per sample
Xpdfu = np.linspace(3, 5, 100)  # grid for the density; same for every sample
for p in range(1, 11):
    # Bounds were passed as (5, 3); high < low is undefined in NumPy.
    X_i = np.random.uniform(3, 5, 5)
    x_bar = np.mean(X_i)
    X5_bar.append(x_bar)
    # `normed=` was removed from plt.hist; `density=True` is its replacement.
    count, bins, patches = plt.hist(X_i, density=True)
    plt.plot(Xpdfu, uniform.pdf(Xpdfu, loc=3, scale=2), linewidth=2, color='r')
    plt.xlabel('X_i plot')
    plt.show()

The plot for X_bar gives :

In [161]:
# Show the 10 sample means (n = 5) and their normalised histogram.
print(X5_bar)
# `normed=` was removed from plt.hist; `density=True` is its replacement.
count, bins, patches = plt.hist(X5_bar, density=True, color='g')
plt.xlabel('X_bar plot pour n=5')
plt.show()
[3.8489150684440383, 3.8256774016173147, 3.7921857609453498, 4.4164366742519974, 4.0419099983276334, 3.9517794079765904, 4.3991710574970142, 3.9833647521508411, 4.4163791677812281, 4.0241454484114083]

p = 10, n = 50

In [162]:
# CLT illustration: p = 10 samples of size n = 50 drawn from U(3, 5).
# Each sample is plotted against the U(3, 5) density and its mean is stored.
X50_bar = []  # sample means, one per sample
Xpdfu = np.linspace(3, 5, 100)  # grid for the density; same for every sample
for p in range(1, 11):
    # Bounds were passed as (5, 3); high < low is undefined in NumPy.
    X_i = np.random.uniform(3, 5, 50)
    x_bar = np.mean(X_i)
    X50_bar.append(x_bar)
    # `normed=` was removed from plt.hist; `density=True` is its replacement.
    count, bins, patches = plt.hist(X_i, density=True)
    plt.plot(Xpdfu, uniform.pdf(Xpdfu, loc=3, scale=2), linewidth=2, color='r')
    plt.show()

The plot for X_bar gives :

In [163]:
# Show the 10 sample means (n = 50) and their normalised histogram.
print(X50_bar)
# `normed=` was removed from plt.hist; `density=True` is its replacement.
count, bins, patches = plt.hist(X50_bar, density=True, color='g')
plt.xlabel('X_bar plot for n=50')
plt.show()
[3.970186126322929, 4.0512484468442453, 4.0015095722851148, 4.0464734914648792, 4.077608640439947, 3.9999770631173015, 4.1484334170812733, 4.0001399254831558, 4.0597599215696301, 4.0246892409560378]

p = 10, n = 500

In [164]:
# CLT illustration: p = 10 samples of size n = 500 drawn from U(3, 5).
# Each sample is plotted against the U(3, 5) density and its mean is stored.
X500_bar = []  # sample means, one per sample
Xpdfu = np.linspace(3, 5, 100)  # grid for the density; same for every sample
for p in range(1, 11):
    # Bounds were passed as (5, 3); high < low is undefined in NumPy.
    X_i = np.random.uniform(3, 5, 500)
    x_bar = np.mean(X_i)
    X500_bar.append(x_bar)
    # `normed=` was removed from plt.hist; `density=True` is its replacement.
    count, bins, patches = plt.hist(X_i, density=True)
    plt.plot(Xpdfu, uniform.pdf(Xpdfu, loc=3, scale=2), linewidth=2, color='r')
    plt.show()
In [165]:
# Show the 10 sample means (n = 500) and their normalised histogram.
print(X500_bar)
# `normed=` was removed from plt.hist; `density=True` is its replacement.
count, bins, patches = plt.hist(X500_bar, density=True, color='g')
plt.xlabel('X_bar plot for n=500')
plt.show()
[4.0184652720278224, 4.0244103470079073, 3.9497324054229765, 4.0285412864930414, 3.9783412213626388, 3.990512154993219, 4.0010580866963714, 3.9954096097719463, 4.023208462754603, 4.0029613279529865]

p = 200, n = 5

In [166]:
# CLT illustration: p = 200 samples of size n = 5 drawn from U(3, 5).
# Each sample is plotted against the U(3, 5) density and its mean is stored.
# NOTE(review): this draws one figure per sample, i.e. 200 figures — confirm
# that the per-sample plots are still wanted at this value of p.
X5_bar = []  # sample means, one per sample
Xpdfu = np.linspace(3, 5, 100)  # grid for the density; same for every sample
for p in range(1, 201):
    # Bounds were passed as (5, 3); high < low is undefined in NumPy.
    X_i = np.random.uniform(3, 5, 5)
    x_bar = np.mean(X_i)
    X5_bar.append(x_bar)
    # `normed=` was removed from plt.hist; `density=True` is its replacement.
    count, bins, patches = plt.hist(X_i, density=True)
    plt.plot(Xpdfu, uniform.pdf(Xpdfu, loc=3, scale=2), linewidth=2, color='r')
    plt.xlabel('X_i plot')
    plt.show()
In [169]:
# Normalised histogram of the 200 sample means obtained with n = 5.
# `normed=` was removed from plt.hist; `density=True` is its replacement.
count, bins, patches = plt.hist(X5_bar, density=True, color='g')
plt.xlabel('X_bar plot pour p= 200 n = 5')
plt.show()

p=200, n= 50

In [170]:
# CLT illustration: p = 200 samples of size n = 50 drawn from U(3, 5).
# Each sample is plotted against the U(3, 5) density and its mean is stored.
# NOTE(review): this draws one figure per sample, i.e. 200 figures — confirm
# that the per-sample plots are still wanted at this value of p.
X50_bar = []  # sample means, one per sample
Xpdfu = np.linspace(3, 5, 100)  # grid for the density; same for every sample
for p in range(1, 201):
    # Bounds were passed as (5, 3); high < low is undefined in NumPy.
    X_i = np.random.uniform(3, 5, 50)
    x_bar = np.mean(X_i)
    X50_bar.append(x_bar)
    # `normed=` was removed from plt.hist; `density=True` is its replacement.
    count, bins, patches = plt.hist(X_i, density=True)
    plt.plot(Xpdfu, uniform.pdf(Xpdfu, loc=3, scale=2), linewidth=2, color='r')
    plt.xlabel('X_i plot')
    plt.show()
In [172]:
# Normalised histogram of the 200 sample means obtained with n = 50.
# `normed=` was removed from plt.hist; `density=True` is its replacement.
count, bins, patches = plt.hist(X50_bar, density=True, color='g')
plt.xlabel('X_bar plot pour p = 200, n = 50')
plt.show()

p = 200, n = 500

In [173]:
# CLT illustration: p = 200 samples of size n = 500 drawn from U(3, 5).
# Each sample is plotted against the U(3, 5) density and its mean is stored.
# NOTE(review): this draws one figure per sample, i.e. 200 figures — confirm
# that the per-sample plots are still wanted at this value of p.
X500_bar = []  # sample means, one per sample
Xpdfu = np.linspace(3, 5, 100)  # grid for the density; same for every sample
for p in range(1, 201):
    # Bounds were passed as (5, 3); high < low is undefined in NumPy.
    X_i = np.random.uniform(3, 5, 500)
    x_bar = np.mean(X_i)
    X500_bar.append(x_bar)
    # `normed=` was removed from plt.hist; `density=True` is its replacement.
    count, bins, patches = plt.hist(X_i, density=True)
    plt.plot(Xpdfu, uniform.pdf(Xpdfu, loc=3, scale=2), linewidth=2, color='r')
    plt.xlabel('X_i plot')
    plt.show()
In [175]:
# Normalised histogram of the 200 sample means obtained with n = 500.
# `normed=` was removed from plt.hist; `density=True` is its replacement.
count, bins, patches = plt.hist(X500_bar, density=True, color='g')
plt.xlabel('X_bar plot pour p = 200, n = 500')
plt.show()

3°) Convergence de la densité empirique vers la densité théorique

In [179]:
# For a growing number of samples, draw that many samples of size n from
# U(3, 5), collect their means, and plot the normal density fitted on those
# means next to the U(3, 5) density of the individual observations.
#
# Fixes relative to the previous version of this cell:
#   * the green curve was fitted on the last raw sample X_i, not on the means;
#   * Xn_bar was never reset, so it mixed the means of every earlier size
#     (and was never used);
#   * the curve was evaluated on `Xpdf`, a [-15, 15] grid defined by another
#     cell for a different distribution;
#   * the uniform bounds were passed as (5, 3), which NumPy leaves undefined.
n = 10  # size of each individual sample
Xpdfu = np.linspace(3, 5, 100)  # grid covering the support of U(3, 5)
for i in [5, 10, 20, 30, 40, 50, 100, 200, 500, 1000, 2000, 5000]:
    Xn_bar = []  # means of the i samples drawn for this value of i
    for p in range(1, i + 1):
        X_i = np.random.uniform(3, 5, n)
        x_bar = np.mean(X_i)
        Xn_bar.append(x_bar)
    plt.plot(Xpdfu, norm.pdf(Xpdfu, np.mean(Xn_bar), np.std(Xn_bar)), linewidth=2, color='g', label='norm pdf')
    plt.plot(Xpdfu, uniform.pdf(Xpdfu, loc=3, scale=2), linewidth=2, color='r', label='uniform pdf')
    plt.legend(loc='best')
    plt.title('p = {0}, n = {1}'.format(i, n))
    plt.show()

Exercise 3

1) Cumuls annuels effectués sur 4 zones géographiques du Sénégal pour 51 années de 1950 à 2000

In [54]:
import pandas as pd

# Load the rainfall file. It has no header row (header=None), so the five
# columns are named right after reading. The multi-character separator is
# handled by the python parsing engine.
# NOTE(review): the path is absolute and machine-specific — adjust it when
# running the notebook elsewhere.
RAIN_FILE = '/home/foutse/Desktop/TPA_MasterTried/pluie.txt'
df = pd.read_csv(RAIN_FILE, sep='      ', engine='python', header=None)
df.columns = ['Years', 'Zone1', 'Zone2', 'Zone3', 'Zone4']
print(df)
    Years   Zone1   Zone2    Zone3    Zone4
0    1950  889.43  840.26  1598.00  1243.80
1    1951  783.55  803.97  1367.90  1432.00
2    1952  773.30  795.44  1335.00  1235.70
3    1953  678.84  732.41  1292.50  1046.30
4    1954  668.37  641.45  1346.20  1320.80
5    1955  768.40  789.84  1522.30  1203.80
6    1956  526.62  678.46  1442.80  1033.80
7    1957  677.72  737.88  1311.30  1286.20
8    1958  751.13  787.75  1785.90  1356.60
9    1959  481.24  711.43  1021.30  1022.80
10   1960  594.14  629.05  1269.30  1076.60
11   1961  572.03  623.85  1367.20  1282.20
12   1962  545.17  602.63  1198.00  1140.40
13   1963  585.07  626.11  1148.60  1085.10
14   1964  663.46  609.74  1271.80  1194.90
15   1965  572.39  727.64  1328.40  1149.10
16   1966  648.99  717.14  1252.40  1121.30
17   1967  724.98  629.96  1445.00   973.34
18   1968  354.17  415.87   767.31   782.87
19   1969  784.58  696.41  1393.30  1041.90
20   1970  399.53  451.46  1077.50   895.79
21   1971  563.44  526.35   993.00  1014.70
22   1972  324.67  400.14   734.67   817.62
23   1973  374.41  421.89  1104.60   917.43
24   1974  487.13  533.54  1104.90  1001.50
25   1975  572.65  639.41  1402.30  1059.90
26   1976  423.84  436.59  1207.30   871.38
27   1977  354.02  411.54   801.49   808.80
28   1978  539.11  507.68  1176.80  1109.70
29   1979  466.17  454.51   973.73   874.37
30   1980  397.21  408.50   735.26   801.77
31   1981  466.01  484.65  1025.80   931.81
32   1982  430.32  406.95   901.92   825.64
33   1983  271.47  366.17   741.96   692.23
34   1984  395.00  369.46   929.03   911.26
35   1985  450.43  453.17   983.11   919.64
36   1986  411.18  456.28   943.16   965.12
37   1987  487.54  506.58   957.68   933.78
38   1988  567.75  568.82  1126.40   930.96
39   1989  590.97  549.27  1148.80   969.30
40   1990  354.76  369.87   906.52   719.58
41   1991  305.71  330.64   964.92   682.45
42   1992  356.61  371.44   926.18  1010.80
43   1993  413.27  448.01  1103.60   950.40
44   1994  470.67  565.88  1121.70  1155.10
45   1995  372.17  474.92   932.55   781.70
46   1996  292.83  387.78  1022.00   808.08
47   1997  309.37  373.36   949.70   902.25
48   1998  324.82  362.26  1106.70   956.23
49   1999  501.12  474.62  1817.30  1158.70
50   2000  447.53  559.83  1068.40   997.70
In [73]:
# Per-zone descriptive statistics, each stored as a list ordered Zone1..Zone4.
zone_names = ['Zone1', 'Zone2', 'Zone3', 'Zone4']
table_mean = [df[zone].mean() for zone in zone_names]
table_min = [df[zone].min() for zone in zone_names]
table_max = [df[zone].max() for zone in zone_names]
table_std = [df[zone].std() for zone in zone_names]
# Range ("étendue") = maximum - minimum of each zone.
table_etendu = [df[zone].max() - df[zone].min() for zone in zone_names]

print('the mean is:', table_mean)
print('the max is:',table_max)
print('the min is:',table_min)
print('the std is:',table_std)
print('the etendu is:',table_etendu)
('the mean is:', [513.04490196078427, 546.44823529411769, 1146.1468627450979, 1007.9450980392155])
('the max is:', [889.42999999999995, 840.25999999999999, 1817.3, 1432.0])
('the min is:', [271.47000000000003, 330.63999999999999, 734.66999999999996, 682.45000000000005])
('the std is:', [153.12671052918949, 143.54191962915758, 250.14568980088544, 176.08847930938069])
('the etendu is:', [617.95999999999992, 509.62, 1082.6300000000001, 749.54999999999995])

Tableau indiquant, pour chaque zone : la moyenne, l’écart type, le minimum, le maximum et l’étendue.

In [135]:
# Summary table: one column per zone, one row per statistic.
# The order of `stat_rows` must match the row labels assigned to the index below.
stat_rows = [table_mean, table_max, table_min, table_std, table_etendu]
b = {'Zone{0}'.format(k + 1): [row[k] for row in stat_rows] for k in range(4)}
dataT = pd.DataFrame(data=b)
dataT.index = ['Moyenne','Maximum','Minimum','l’écart type','l’étendue']
dataT
Out[135]:
Zone1 Zone2 Zone3 Zone4
Moyenne 513.044902 546.448235 1146.146863 1007.945098
Maximum 889.430000 840.260000 1817.300000 1432.000000
Minimum 271.470000 330.640000 734.670000 682.450000
l’écart type 153.126711 143.541920 250.145690 176.088479
l’étendue 617.960000 509.620000 1082.630000 749.550000

Figure des courbes de chaque variable

In [129]:
# Draw the yearly series of the four zones on a single figure, one colour each.
zone_colours = [('Zone1', 'cyan'), ('Zone2', 'red'), ('Zone3', 'magenta'), ('Zone4', 'green')]
for zone_name, colour in zone_colours:
    plt.plot(df['Years'], df[zone_name], color=colour, label=zone_name)
plt.legend(loc='best')
plt.title('Figure des courbes de chaque variable')
plt.xlabel('Annees')
plt.show()
In [109]:
# Levels used to frame each series: mean + 1 std (MEs_*) and mean - 1 std (MEd_*).
# Generator expressions are unpacked in zone order (Zone1..Zone4).
MEs_Z1, MEs_Z2, MEs_Z3, MEs_Z4 = (
    table_mean[k] + table_std[k] for k in range(4)
)
MEd_Z1, MEd_Z2, MEd_Z3, MEd_Z4 = (
    table_mean[k] - table_std[k] for k in range(4)
)

2°) Les courbes des séries chronologiques de pluie et leur encadrement à plus ou moins 1 écart type de la moyenne pour chaque zone.

In [130]:
# Zone 1 series with three horizontal reference lines:
# mean + 1 std, mean - 1 std and the mean itself.
plt.plot(df['Years'], df['Zone1'], color='cyan')
reference_lines = [
    (MEs_Z1, 'blue', 'moyenne + ecart type'),
    (MEd_Z1, 'red', 'moyenne - ecart type'),
    (table_mean[0], 'green', 'Moyenne'),
]
for level, colour, caption in reference_lines:
    plt.axhline(level, color=colour, label=caption)
plt.legend(loc='upper right')
plt.title('Figure des courbes de la Zone 1')
plt.xlabel('Annees')
plt.show()
In [131]:
# Zone 2 series with three horizontal reference lines:
# mean + 1 std, mean - 1 std and the mean itself.
plt.plot(df['Years'], df['Zone2'], color='cyan')
reference_lines = [
    (MEs_Z2, 'blue', 'moyenne + ecart type'),
    (MEd_Z2, 'red', 'moyenne - ecart type'),
    (table_mean[1], 'green', 'Moyenne'),
]
for level, colour, caption in reference_lines:
    plt.axhline(level, color=colour, label=caption)
plt.legend(loc='best')
plt.title('Figure des courbes de la Zone 2')
plt.xlabel('Annees')
plt.show()
In [132]:
# Zone 3 series with three horizontal reference lines:
# mean + 1 std, mean - 1 std and the mean itself.
plt.plot(df['Years'], df['Zone3'], color='cyan')
reference_lines = [
    (MEs_Z3, 'blue', 'moyenne + ecart type'),
    (MEd_Z3, 'red', 'moyenne - ecart type'),
    (table_mean[2], 'green', 'Moyenne'),
]
for level, colour, caption in reference_lines:
    plt.axhline(level, color=colour, label=caption)
plt.legend(loc='best')
plt.title('Figure des courbes de la Zone 3')
plt.xlabel('Annees')
plt.show()
In [136]:
# Zone 4 series with three horizontal reference lines:
# mean + 1 std, mean - 1 std and the mean itself.
plt.plot(df['Years'], df['Zone4'], color='cyan')
reference_lines = [
    (MEs_Z4, 'blue', 'moyenne + ecart type'),
    (MEd_Z4, 'red', 'moyenne - ecart type'),
    (table_mean[3], 'green', 'Moyenne'),
]
for level, colour, caption in reference_lines:
    plt.axhline(level, color=colour, label=caption)
plt.legend(loc='best')
plt.title('Figure des courbes de la Zone 4')
plt.xlabel('Annees')
plt.show()

3°) les quartiles (Q1, Q2, Q3) ainsi que l’écart interquartile (Q3-Q1) de chaque zone.

In [144]:
def _rounded_quartiles(values):
    """Return (Q1, Q2, Q3) of `values`, each passed through the built-in round().

    Q1, Q2 and Q3 are the 25th, 50th (median) and 75th percentiles as computed
    by np.percentile.
    """
    return tuple(round(np.percentile(values, level)) for level in (25, 50, 75))


# Quartiles of each zone; the individual names are kept for later use.
Q1_Z1, Q2_Z1, Q3_Z1 = _rounded_quartiles(df["Zone1"])
Q1_Z2, Q2_Z2, Q3_Z2 = _rounded_quartiles(df["Zone2"])
Q1_Z3, Q2_Z3, Q3_Z3 = _rounded_quartiles(df["Zone3"])
Q1_Z4, Q2_Z4, Q3_Z4 = _rounded_quartiles(df["Zone4"])

# Print the quartiles zone by zone, in the order Q1, Q2, Q3.
zone_quartiles = [
    (Q1_Z1, Q2_Z1, Q3_Z1),
    (Q1_Z2, Q2_Z2, Q3_Z2),
    (Q1_Z3, Q2_Z3, Q3_Z3),
    (Q1_Z4, Q2_Z4, Q3_Z4),
]
for zone_no, triple in enumerate(zone_quartiles, 1):
    for quartile_no, value in enumerate(triple, 1):
        print('Q{0} de Z{1}:'.format(quartile_no, zone_no), value)

# Table of the quartiles plus the interquartile range (Q3 - Q1), computed from
# the rounded quartiles.
q = {
    'Zone1': [Q1_Z1, Q2_Z1, Q3_Z1, Q3_Z1 - Q1_Z1],
    'Zone2': [Q1_Z2, Q2_Z2, Q3_Z2, Q3_Z2 - Q1_Z2],
    'Zone3': [Q1_Z3, Q2_Z3, Q3_Z3, Q3_Z3 - Q1_Z3],
    'Zone4': [Q1_Z4, Q2_Z4, Q3_Z4, Q3_Z4 - Q1_Z4],
}
Q_table = pd.DataFrame(data=q)
Q_table.index = ['Q1','Q2','Q3', 'Q3 - Q1']
Q_table
('Q1 de Z1:', 396.0)
('Q2 de Z1:', 487.0)
('Q3 de Z1:', 593.0)
('Q1 de Z2:', 419.0)
('Q2 de Z2:', 526.0)
('Q3 de Z2:', 640.0)
('Q1 de Z3:', 961.0)
('Q2 de Z3:', 1107.0)
('Q3 de Z3:', 1320.0)
('Q1 de Z4:', 899.0)
('Q2 de Z4:', 998.0)
('Q3 de Z4:', 1131.0)
Out[144]:
Zone1 Zone2 Zone3 Zone4
Q1 396.0 419.0 961.0 899.0
Q2 487.0 526.0 1107.0 998.0
Q3 593.0 640.0 1320.0 1131.0
Q3 - Q1 197.0 221.0 359.0 232.0
In [150]:
# Box plots of the four zones side by side.
# whis='range' was removed from matplotlib; whis=[0, 100] is the equivalent
# percentile form and puts the whiskers at the minimum and maximum of the data.
plt.boxplot([df["Zone1"], df["Zone2"], df["Zone3"], df["Zone4"]], whis=[0, 100])
# Name the boxes, which are drawn at x positions 1..4.
plt.xticks([1, 2, 3, 4], ['Zone1', 'Zone2', 'Zone3', 'Zone4'])
plt.title('boxplot avec sequence')
# Render the figure explicitly instead of leaving a Text repr as the cell output.
plt.show()
Out[150]:
<matplotlib.text.Text at 0x7fe3cd4f69d0>
In [ ]: